{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 25,
      "source": [
        "import pandas as pd\r\n",
        "import numpy as np"
      ],
      "outputs": [],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "execution_count": 26,
      "source": [
        "def optimize_memory_usage(df, print_size=True):\r\n",
        "    # Function optimizes memory usage in dataframe.\r\n",
        "    # (RU) Функция оптимизации типов в dataframe.\r\n",
        "    \r\n",
        "    # Types for optimization.\r\n",
        "    # Типы, которые будем проверять на оптимизацию.\r\n",
        "    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\r\n",
        "    # Memory usage size before optimize (Mb).\r\n",
        "    # (RU) Размер занимаемой памяти до оптимизации (в Мб).\r\n",
        "    before_size = df.memory_usage().sum() / 1024**2    \r\n",
        "    for column in df.columns:\r\n",
        "        column_type = df[column].dtypes\r\n",
        "        if column_type in numerics:\r\n",
        "            column_min = df[column].min()\r\n",
        "            column_max = df[column].max()\r\n",
        "            if str(column_type).startswith('int'):\r\n",
        "                if column_min > np.iinfo(np.int8).min and column_max < np.iinfo(np.int8).max:\r\n",
        "                    df[column] = df[column].astype(np.int8)\r\n",
        "                elif column_min > np.iinfo(np.int16).min and column_max < np.iinfo(np.int16).max:\r\n",
        "                    df[column] = df[column].astype(np.int16)\r\n",
        "                elif column_min > np.iinfo(np.int32).min and column_max < np.iinfo(np.int32).max:\r\n",
        "                    df[column] = df[column].astype(np.int32)\r\n",
        "                elif column_min > np.iinfo(np.int64).min and column_max < np.iinfo(np.int64).max:\r\n",
        "                    df[column] = df[column].astype(np.int64)  \r\n",
        "            else:\r\n",
        "                if column_min > np.finfo(np.float32).min and column_max < np.finfo(np.float32).max:\r\n",
        "                    df[column] = df[column].astype(np.float32)\r\n",
        "                else:\r\n",
        "                    df[column] = df[column].astype(np.float64)    \r\n",
        "    # Memory usage size after optimize (Mb).\r\n",
        "    # (RU) Размер занимаемой памяти после оптимизации (в Мб).\r\n",
        "    after_size = df.memory_usage().sum() / 1024**2\r\n",
        "    if print_size: print('Memory usage size: before {:5.4f} Mb - after {:5.4f} Mb ({:.1f}%).'.format(before_size, after_size, 100 * (before_size - after_size) / before_size))\r\n",
        "    return df"
      ],
      "outputs": [],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "execution_count": 30,
      "source": [
        "def import_data_from_csv(filePath):\r\n",
        "    # Load a dataframe from csv-file and optimize its memory usage.\r\n",
        "    # (RU) Загрузка данных из csv-файла и оптимизация числовых типов для оптимизации использования памяти\r\n",
        "    df = pd.read_csv(filePath, parse_dates=True, keep_date_col=True)\r\n",
        "    # Show dataframe info before optimize.\r\n",
        "    # (RU) Показать информацию о таблице до оптимизации.\r\n",
        "    print('-' * 80)\r\n",
        "    print(df.info())\r\n",
        "    print('-' * 80)\r\n",
        "    # (RU) Оптимизация типов в dataframe.\r\n",
        "    df = optimize_memory_usage(df)\r\n",
        "    # Show dataframe info after optimize.\r\n",
        "    # (RU) Показать информацию о таблице после оптимизации.\r\n",
        "    print('-' * 80)\r\n",
        "    print(df.info())\r\n",
        "    print('-' * 80)\r\n",
        "    return df"
      ],
      "outputs": [],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "execution_count": 31,
      "source": [
        "df = import_data_from_csv('../202110 open/Airport/Science.csv')"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--------------------------------------------------------------------------------\n",
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 157 entries, 0 to 156\n",
            "Data columns (total 3 columns):\n",
            " #   Column      Non-Null Count  Dtype  \n",
            "---  ------      --------------  -----  \n",
            " 0   Passengers  145 non-null    float64\n",
            " 1   Year        157 non-null    int64  \n",
            " 2   Month       157 non-null    int64  \n",
            "dtypes: float64(1), int64(2)\n",
            "memory usage: 3.8 KB\n",
            "None\n",
            "--------------------------------------------------------------------------------\n",
            "Memory usage size: before 0.0037 Mb - after 0.0012 Mb (68.5%).\n",
            "--------------------------------------------------------------------------------\n",
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 157 entries, 0 to 156\n",
            "Data columns (total 3 columns):\n",
            " #   Column      Non-Null Count  Dtype  \n",
            "---  ------      --------------  -----  \n",
            " 0   Passengers  145 non-null    float32\n",
            " 1   Year        157 non-null    int16  \n",
            " 2   Month       157 non-null    int8   \n",
            "dtypes: float32(1), int16(1), int8(1)\n",
            "memory usage: 1.2 KB\n",
            "None\n",
            "--------------------------------------------------------------------------------\n"
          ]
        }
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "execution_count": 32,
      "source": [
        "df.head()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Passengers</th>\n",
              "      <th>Year</th>\n",
              "      <th>Month</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1235.750000</td>\n",
              "      <td>2007</td>\n",
              "      <td>4</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1487.530029</td>\n",
              "      <td>2007</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1563.959961</td>\n",
              "      <td>2007</td>\n",
              "      <td>7</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1575.270020</td>\n",
              "      <td>2007</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>1762.000000</td>\n",
              "      <td>2007</td>\n",
              "      <td>8</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "    Passengers  Year  Month\n",
              "0  1235.750000  2007      4\n",
              "1  1487.530029  2007      3\n",
              "2  1563.959961  2007      7\n",
              "3  1575.270020  2007      0\n",
              "4  1762.000000  2007      8"
            ]
          },
          "metadata": {},
          "execution_count": 32
        }
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "As we can see, the data frame types were float64 (1) and int64 (2), but after optimization they became float32 (1), int16 (1) and int8 (1).\r\n",
        "\r\n",
        "(RU) Видно, что при загрузке dataframe были назначены типы - float64(1) и int64(2), а после оптимизации они преобразованы в float32(1), int16(1) и int8(1)."
      ],
      "metadata": {}
    }
  ],
  "metadata": {
    "orig_nbformat": 4,
    "language_info": {
      "name": "python",
      "version": "3.9.7",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3.9.7 64-bit"
    },
    "interpreter": {
      "hash": "dc975e5bba14374f9fc3fe829ba6299449c2760668c85f92aa0e8eac50bb3207"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}